# BigQuery query: one row per file that has a detected language, carrying the
# timestamp of the commit that first introduced the file. The join key is the
# file `sha`. The LEFT JOIN combined with `loc.language IS NOT NULL` keeps
# only files that have a matching lines-of-code record with a language.
# NOTE(review): the bracketed table references are BigQuery legacy SQL syntax.
query <- "
SELECT
  commit.repo_name AS repo_name,
  commit.file_name AS file_name,
  commit.path AS path,
  commit.sha AS sha,
  commit.init_commit_timestamp AS init_commit,
  loc.language AS language
FROM
  [github-bioinformatics-171721:repos.file_init_commit] AS commit
LEFT JOIN
  [github-bioinformatics-171721:analysis.lines_of_code_by_file] AS loc
ON
  commit.sha = loc.sha
WHERE
  loc.language IS NOT NULL
"

# Reduce every file to (year of initial commit, language). The calendar year
# is read straight off the parsed date instead of going through
# cut() -> factor -> substr(), which yields the same number but fails when
# there are no usable dates to build breaks from.
data_all_lang <- query_exec(query, project = proj, max_pages = Inf) %>%
  transmute(
    year = as.numeric(format(as.Date(parse_iso_8601(init_commit)), "%Y")),
    language
  )
## 0 bytes processed
# Number of files (across all languages) first committed in each year; used
# as the denominator when computing each language's share of a year.
total_files_per_year <- summarize(
  group_by(data_all_lang, year),
  total_files_year = n()
)

# Per (year, language) file counts for the languages in `top_langs`, plus each
# language's share of all files created that year.
#   num_files            - files of that language first committed in that year
#   total_files_year     - files of any language first committed in that year
#   pct_total_files_year - num_files / total_files_year (a fraction, not 0-100)
#   year_fmt             - the year as a character string, for a discrete axis
data <- data_all_lang %>%
  filter(language %in% top_langs) %>%
  group_by(year, language) %>%
  summarise(num_files = n()) %>%
  # summarise() only peels off the last grouping level; drop the remaining
  # grouping by `year` so it does not leak into later operations.
  ungroup() %>%
  arrange(year) %>%
  left_join(total_files_per_year, by = "year") %>%
  mutate(
    pct_total_files_year = num_files / total_files_year,
    # `year` is numeric; convert explicitly rather than via substr() coercion.
    year_fmt = as.character(year)
  )
# Overview plot: each top language's share of the files created per year,
# from 2010 onwards, drawn as one line per language.
# The filter uses the numeric `year` column; `year_fmt` is character, so
# comparing it with 2010 would be a text comparison after coercion.
ggplot(data %>% filter(year >= 2010),
       aes(x = year_fmt,
           y = pct_total_files_year,
           colour = language,
           group = language)) +
  geom_line() +
  # The y values are fractions of 1; show the ticks as percentages so they
  # agree with the axis title.
  scale_y_continuous(
    labels = function(x) paste0(format(100 * x, trim = TRUE), "%")
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        axis.title.x = element_text(size = 18),
        axis.title.y = element_text(size = 18),
        plot.title = element_text(size = 24)) +
  xlab("Year") +
  ylab("Percentage of total files created") +
  ggtitle("Language use over time") +
  scale_color_discrete("Language")

#' Plot one language's share of all files created per year (2010 onwards).
#'
#' @param lang Language name, matched against the `language` column.
#' @param plot_data Data frame with columns `year`, `year_fmt`, `language`
#'   and `pct_total_files_year`. Defaults to the script-level `data`, so
#'   existing `plot_lang_pct(lang)` calls behave as before.
#' @return A ggplot object; the caller is responsible for printing it.
plot_lang_pct <- function(lang, plot_data = data) {
  # Filter on the numeric `year`; `year_fmt` is character, so comparing it
  # with 2010 would be a text comparison after coercion.
  lang_rows <- plot_data %>%
    filter(year >= 2010 & language == lang)

  ggplot(lang_rows,
         aes(x = year_fmt,
             y = pct_total_files_year,
             group = language)) +
    geom_line() +
    # The y values are fractions of 1; show the ticks as percentages so they
    # agree with the axis title.
    scale_y_continuous(
      labels = function(x) paste0(format(100 * x, trim = TRUE), "%")
    ) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          axis.title.x = element_text(size = 18),
          axis.title.y = element_text(size = 18),
          plot.title = element_text(size = 24)) +
    xlab("Year") +
    ylab("Percentage of total files created") +
    ggtitle(paste("Percentage of total files created:", lang))
}

#' Plot the number of files created per year for one language (2010 onwards).
#'
#' @param lang Language name, matched against the `language` column.
#' @param plot_data Data frame with columns `year`, `year_fmt`, `language`
#'   and `num_files`. Defaults to the script-level `data`, so existing
#'   `plot_lang_total(lang)` calls behave as before.
#' @return A ggplot object; the caller is responsible for printing it.
plot_lang_total <- function(lang, plot_data = data) {
  # Filter on the numeric `year`; `year_fmt` is character, so comparing it
  # with 2010 would be a text comparison after coercion.
  lang_rows <- plot_data %>%
    filter(year >= 2010 & language == lang)

  ggplot(lang_rows,
         aes(x = year_fmt,
             y = num_files,
             group = language)) +
    geom_line() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          axis.title.x = element_text(size = 18),
          axis.title.y = element_text(size = 18),
          plot.title = element_text(size = 24)) +
    xlab("Year") +
    ylab("Total files created") +
    ggtitle(paste("Number of files created:", lang))
}

# Render both per-language plots for every top language: the absolute file
# count first, then the share of that year's files.
for (lang in top_langs) {
  lang %>% plot_lang_total() %>% print()
  lang %>% plot_lang_pct() %>% print()
}